{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ijGzTHJJUCPY"
      },
      "outputs": [],
      "source": [
        "# Copyright 2024 Google LLC\n",
        "#\n",
        "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
        "# you may not use this file except in compliance with the License.\n",
        "# You may obtain a copy of the License at\n",
        "#\n",
        "#     https://www.apache.org/licenses/LICENSE-2.0\n",
        "#\n",
        "# Unless required by applicable law or agreed to in writing, software\n",
        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
        "# See the License for the specific language governing permissions and\n",
        "# limitations under the License."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "VEqbX8OhE8y9"
      },
      "source": [
        "# Synthetic Data Generation using Gemini APIs\n",
        "\n",
        "<table align=\"left\">\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/data-generation/synthetic_data_generation_using_gemini.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg\" alt=\"Google Colaboratory logo\"><br> Run in Colab\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fuse-cases%2Fdata-generation%2Fsynthetic_data_generation_using_gemini.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN\" alt=\"Google Cloud Colab Enterprise logo\"><br> Run in Colab Enterprise\n",
        "    </a>\n",
        "  </td>    \n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/data-generation/synthetic_data_generation_using_gemini.ipynb\">\n",
        "      <img width=\"32px\" src=\"https://raw.githubusercontent.com/primer/octicons/refs/heads/main/icons/mark-github-24.svg\" alt=\"GitHub logo\"><br> View on GitHub\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/data-generation/synthetic_data_generation_using_gemini.ipynb\">\n",
        "      <img src=\"https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32\" alt=\"Vertex AI logo\"><br> Open in Vertex AI Workbench\n",
        "    </a>\n",
        "  </td>\n",
        "  <td style=\"text-align: center\">\n",
        "    <a href=\"https://goo.gle/4jdNeuB\">\n",
        "      <img width=\"32px\" src=\"https://cdn.qwiklabs.com/assets/gcp_cloud-e3a77215f0b8bfa9b3f611c0d2208c7e8708ed31.svg\" alt=\"Google Cloud logo\"><br> Open in  Cloud Skills Boost\n",
        "    </a>\n",
        "  </td>\n",
        "</table>\n",
        "\n",
        "<div style=\"clear: both;\"></div>\n",
        "\n",
        "<b>Share to:</b>\n",
        "\n",
        "<a href=\"https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/data-generation/synthetic_data_generation_using_gemini.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg\" alt=\"LinkedIn logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/data-generation/synthetic_data_generation_using_gemini.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg\" alt=\"Bluesky logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/data-generation/synthetic_data_generation_using_gemini.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg\" alt=\"X logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/data-generation/synthetic_data_generation_using_gemini.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png\" alt=\"Reddit logo\">\n",
        "</a>\n",
        "\n",
        "<a href=\"https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/data-generation/synthetic_data_generation_using_gemini.ipynb\" target=\"_blank\">\n",
        "  <img width=\"20px\" src=\"https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg\" alt=\"Facebook logo\">\n",
        "</a>            \n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fANaqaNOgK_N"
      },
      "source": [
        "| | |\n",
        "|-|-|\n",
        "|Author(s) | [Madhup Sukoon](https://github.com/vagrantism) |"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "CkHPv2myT2cx"
      },
      "source": [
        "## Overview\n",
        "\n",
        "### Gemini\n",
        "\n",
        "Gemini is a family of generative AI models developed by Google DeepMind that is designed for multimodal use cases. The Gemini API gives you access to the Gemini model.\n",
        "\n",
        "### Gemini API in Vertex AI\n",
        "\n",
        "The Gemini API in Vertex AI provides a unified interface for interacting with Gemini models.\n",
        "\n",
        "- **Gemini**: Designed to handle natural language tasks, multi-turn text and code chat, and code generation.\n",
        "\n",
        "You can interact with the Gemini API using the following methods:\n",
        "\n",
        "- Use [Vertex AI Studio](https://cloud.google.com/generative-ai-studio) for quick testing and command generation\n",
        "- Use cURL commands\n",
        "- Use the Vertex AI SDK\n",
        "\n",
        "This notebook focuses on using the **Vertex AI SDK for Python** to call the Gemini API in Vertex AI for the Gemini model.\n",
        "\n",
        "For more information, see the [Generative AI on Vertex AI](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/overview) documentation.\n",
        "\n",
        "### Snowfakery\n",
        "\n",
        "[Snowfakery](https://snowfakery.readthedocs.io/en/docs/index.html) is a framework for generating complex fake data at scale. It has a lot of in-built plugins, and allows you to extend its functionality by writing your own plugins.\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DrkcqHrrwMAo"
      },
      "source": [
        "### Objectives\n",
        "\n",
        "This repository demonstrates how Gemini can be leveraged as a Snowfakery plugin for generating synthetic data based on a set of predefined schemas and data generation strategies. The framework is based on [Snowfakery](https://snowfakery.readthedocs.io/) which is itself based on [Faker](https://faker.readthedocs.io/). It requires the expected outputs to be codified in a YAML file per Snowfakery specs, detailing all the required fields and their respective data generation strategies. The framework currently supports 3 such strategies:\n",
        "\n",
        "1. Static Values - can be included in the YAML file itself.\n",
        "2. Faker based values - Leverages the Faker library to generate fake data.\n",
        "3. LLM based values - Leverages an LLM call and a predefined prompt template to generate data\n",
        "\n",
        "It is also possible to use arbitrary python functions to generate data and augment the pipeline. Interrelated schemas are also supported where the value of a given field depends on an already defined field, which allows us to create hierarchical data and complex schemas. The data generated via this framework is saved to a CSV file for further analysis / consumption.\n",
        "\n",
        "Although this notebook can be used for any synthetic-data generation use-case and schema, the current examples shows the simple use case of generating long (blogs) and short (comments) format contents using a given Wikipedia page as the seed data. While the primary purpose of the synthetic data generation pipeline is to generate data for testing, this can also be used to support tangential use-cases like running prompt experiments and comparisons at scale, building few-shot examples, evaluating fine-tuned models, etc."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "C9nEPojogw-g"
      },
      "source": [
        "### Costs\n",
        "\n",
        "This tutorial uses billable components of Google Cloud:\n",
        "\n",
        "- Vertex AI\n",
        "\n",
        "Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "r11Gu7qNgx1p"
      },
      "source": [
        "## Getting Started\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "No17Cw5hgx12"
      },
      "source": [
        "### Install Vertex AI SDK for Python and other dependencies\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "tFy3H3aPgx12"
      },
      "outputs": [],
      "source": [
        "%pip install --upgrade --user -q google-cloud-aiplatform snowfakery==3.6.2 wikipedia-api==0.6.0"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "dmWOrTJ3gx13"
      },
      "source": [
        "### Authenticate your notebook environment (Colab only)\n",
        "\n",
        "If you are running this notebook on Google Colab, run the following cell to authenticate your environment. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NyKGtVQjgx13"
      },
      "outputs": [],
      "source": [
        "import sys\n",
        "\n",
        "# Additional authentication is required for Google Colab\n",
        "if \"google.colab\" in sys.modules:\n",
        "    # Authenticate user to Google Cloud\n",
        "    from google.colab import auth\n",
        "\n",
        "    auth.authenticate_user()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "DF4l8DTdWgPY"
      },
      "source": [
        "### Set Google Cloud project information and initialize Vertex AI SDK\n",
        "\n",
        "To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).\n",
        "\n",
        "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "Nqwi-5ufWp_B"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "\n",
        "PROJECT_ID = \"[your-project-id]\"  # @param {type: \"string\", placeholder: \"[your-project-id]\", isTemplate: true}\n",
        "if not PROJECT_ID or PROJECT_ID == \"[your-project-id]\":\n",
        "    PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n",
        "\n",
        "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")\n",
        "\n",
        "# Initialize Vertex AI\n",
        "import vertexai\n",
        "\n",
        "vertexai.init(project=PROJECT_ID, location=LOCATION)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jXHfaVS66_01"
      },
      "source": [
        "### Import libraries\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "id": "lslYAvw37JGQ"
      },
      "outputs": [],
      "source": [
        "from io import StringIO\n",
        "import logging\n",
        "import sys\n",
        "import types\n",
        "\n",
        "import jinja2\n",
        "from snowfakery import generate_data\n",
        "from snowfakery.plugins import SnowfakeryPlugin\n",
        "from vertexai.generative_models import GenerationConfig, GenerativeModel\n",
        "import wikipediaapi"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OK6TsnYghrQk"
      },
      "source": [
        "## Creating Plugins and Prompts\n",
        "\n",
        "The following cells create the 2 custom plugins we need for this use case along with the needed prompts."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "5Hg7cMyOIyP7"
      },
      "source": [
        "### Creating Plugins\n",
        "The first plugin gives us the ability to interact with Wikipedia and fetch the contents for a given page. The second plugin allows us to interact with the Gemini API."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "metadata": {
        "id": "4-VrhlHuEfnJ"
      },
      "outputs": [],
      "source": [
        "class SyntheticDataGeneration:\n",
        "    \"\"\"\n",
        "    Implements all the extra functionality needed for this use-case\n",
        "    \"\"\"\n",
        "\n",
        "    class Plugins(types.ModuleType):\n",
        "        \"\"\"\n",
        "        Provides the plugins needed to extend Snowfakery\n",
        "        \"\"\"\n",
        "\n",
        "        class Gemini(SnowfakeryPlugin):\n",
        "            \"\"\"\n",
        "            Plugin for interacting with Gemini.\n",
        "            \"\"\"\n",
        "\n",
        "            class Functions:\n",
        "                \"\"\"\n",
        "                Functions to implement field / object level data generation\n",
        "                \"\"\"\n",
        "\n",
        "                model: GenerativeModel = GenerativeModel(\"gemini-2.0-flash\")\n",
        "\n",
        "                def fill_prompt(self, prompt_name: str, **kwargs) -> str:\n",
        "                    \"\"\"\n",
        "                    Returns a formatted prompt\n",
        "                    \"\"\"\n",
        "                    return (\n",
        "                        jinja2.Environment(\n",
        "                            loader=jinja2.FileSystemLoader(searchpath=\"./\")\n",
        "                        )\n",
        "                        .get_template(prompt_name)\n",
        "                        .render(**kwargs)\n",
        "                    )\n",
        "\n",
        "                def generate(\n",
        "                    self,\n",
        "                    prompt_name: str,\n",
        "                    temperature=0.9,\n",
        "                    top_p=1,\n",
        "                    **kwargs,\n",
        "                ) -> str | None:\n",
        "                    \"\"\"\n",
        "                    A wrapper around Gemini plugin\n",
        "                    \"\"\"\n",
        "                    logging.info(\"Preparing Prompt %s with %s\", prompt_name, kwargs)\n",
        "                    prompt = self.fill_prompt(prompt_name, **kwargs)\n",
        "                    logging.info(\"Prompt %s Prepared\", prompt_name)\n",
        "                    try:\n",
        "                        logging.info(\"Calling Gemini For %s\", prompt_name)\n",
        "                        response = self.model.generate_content(\n",
        "                            prompt,\n",
        "                            generation_config=GenerationConfig(\n",
        "                                temperature=temperature,\n",
        "                                max_output_tokens=8192,\n",
        "                                top_p=top_p,\n",
        "                            ),\n",
        "                        )\n",
        "                    except Exception as e:\n",
        "                        logging.trace(\n",
        "                            (\n",
        "                                \"Unable to generate text using %s.\\n\"\n",
        "                                \"Prepared Prompt: \\n%s\\n\\nError: %s\"\n",
        "                            ),\n",
        "                            prompt_name,\n",
        "                            prompt,\n",
        "                            e,\n",
        "                        )\n",
        "                        return None\n",
        "\n",
        "                    try:\n",
        "                        return response.text\n",
        "                    except Exception as e:\n",
        "                        logging.trace(\n",
        "                            (\n",
        "                                \"Unable to generate text using %s.\\n\"\n",
        "                                \"Prepared Prompt: \\n%s\\n\\n\"\n",
        "                                \"Received Response: \\n%s\\n\\n\"\n",
        "                                \"Error: %s\"\n",
        "                            ),\n",
        "                            prompt_name,\n",
        "                            prompt,\n",
        "                            response,\n",
        "                            e,\n",
        "                        )\n",
        "                        return None\n",
        "\n",
        "        class Wikipedia(SnowfakeryPlugin):\n",
        "            \"\"\"\n",
        "            Plugin for interacting with Wikipedia.\n",
        "            \"\"\"\n",
        "\n",
        "            class Functions:\n",
        "                \"\"\"\n",
        "                Implements a single function to fetch a Wikipedia page\n",
        "                \"\"\"\n",
        "\n",
        "                def get_page(self, title: str):\n",
        "                    \"\"\"\n",
        "                    Returns the title, URL and sections of the given wikipedia page\n",
        "                    \"\"\"\n",
        "                    logging.info(\"Parsing Wikipedia Page %s\", title)\n",
        "                    page = wikipediaapi.Wikipedia(\n",
        "                        \"Snowfakery (example@google.com)\", \"en\"\n",
        "                    ).page(title)\n",
        "                    results = {\"sections\": {}, \"title\": page.title, \"url\": page.fullurl}\n",
        "                    sections = [(s.title, s) for s in page.sections]\n",
        "                    while sections:\n",
        "                        sec_title, sec_obj = sections.pop()\n",
        "                        if sec_title in [\n",
        "                            \"External links\",\n",
        "                            \"References\",\n",
        "                            \"See also\",\n",
        "                            \"Further reading\",\n",
        "                        ]:\n",
        "                            continue\n",
        "                        if sec_obj.text:\n",
        "                            results[\"sections\"][sec_title] = sec_obj.text\n",
        "                        for sub_sec in sec_obj.sections:\n",
        "                            sections.append((f\"{sec_title} - {sub_sec.title}\", sub_sec))\n",
        "                    logging.info(\"Parsing Wikipedia Page %s Complete\", title)\n",
        "                    return results"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_wDlaPt4I9lI"
      },
      "source": [
        "### Making plugins discoverable\n",
        "\n",
        "We add the created class to `sys.modules` to ensure Snowfakery can find them and import them as modules as needed."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "metadata": {
        "id": "uDbBxdLQJFjg"
      },
      "outputs": [],
      "source": [
        "sys.modules[\"SyntheticDataGeneration.Plugins\"] = SyntheticDataGeneration.Plugins(\n",
        "    name=\"SyntheticDataGeneration.Plugins\"\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ML96RDV6L-St"
      },
      "source": [
        "### Creating Prompt Templates\n",
        "\n",
        "We add the created class to `sys.modules` to ensure Snowfakery can find them and import them as modules as needed."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "metadata": {
        "id": "L05Um44dMFWN"
      },
      "outputs": [],
      "source": [
        "%%writefile blog_generator.jinja\n",
        "You are an expert content creator who writes detailed, factual blogs.\n",
        "You have been asked to write a blog about {{idea_title}}.\n",
        "To get you started, you have also been given the following context about the topic:\n",
        "\n",
        "{{idea_body}}\n",
        "\n",
        "Ensure the blog that you write is interesting,detailed and factual.\n",
        "Take a deep breath and start writing:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "metadata": {
        "id": "PGa_a8GFO0dO"
      },
      "outputs": [],
      "source": [
        "%%writefile comment_generator.jinja\n",
        "You are {{first_name}} {{last_name}}. You are {{age}} years old. You are interested in {{interests}}. You work at {{organization}} as a {{profession}}.\n",
        "You came across the following article:\n",
        "\n",
        "{{blog_title}}\n",
        "\n",
        "{{blog_body}}\n",
        "\n",
        "Present your thoughts and feelings about the article in a short comment.\n",
        "\n",
        "Comment:"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "FTMywdzUORIA"
      },
      "source": [
        "## Creating the Recipe\n",
        "In order to generate synthetic data, the schema of the synthetic data must be defined first. This is done by creating a `recipe` in a YAML format as demonstrated below, more details on writing recipes can be found [here](https://snowfakery.readthedocs.io/en/latest/#central-concepts)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 14,
      "metadata": {
        "id": "lRyTw2iPhEXG"
      },
      "outputs": [],
      "source": [
        "recipe = \"\"\"\n",
        "- plugin: SyntheticDataGeneration.Plugins.Wikipedia\n",
        "- plugin: SyntheticDataGeneration.Plugins.Gemini\n",
        "- option: wiki_title\n",
        "- var: __seed\n",
        "  value:\n",
        "    - Wikipedia.get_page :\n",
        "      title : ${{wiki_title}}\n",
        "\n",
        "- object : users\n",
        "  count : ${{random_number(min=100, max=500)}}\n",
        "  fields :\n",
        "    first_name : ${{fake.FirstName}}\n",
        "    last_name : ${{fake.FirstName}}\n",
        "    age:\n",
        "      random_number:\n",
        "        min: 18\n",
        "        max: 95\n",
        "    email : ${{fake.Email}}\n",
        "    phone : ${{fake.PhoneNumber}}\n",
        "    interests : ${{fake.Bs}}\n",
        "    postal_code : ${{fake.Postalcode}}\n",
        "    organization : ${{fake.Company}}\n",
        "    profession : ${{fake.Job}}\n",
        "\n",
        "- object : seeds\n",
        "  fields :\n",
        "    title : ${{__seed['title']}}\n",
        "    url : ${{__seed['url']}}\n",
        "    section_count : ${{__seed['sections'] | length}}\n",
        "\n",
        "  friends:\n",
        "    - object : blog_ideas\n",
        "      count : ${{seeds.section_count}}\n",
        "      fields :\n",
        "        seed_id : ${{seeds.id}}\n",
        "        section : ${{(__seed.sections.keys() | list)[child_index]}}\n",
        "        body : ${{__seed.sections[section]}}\n",
        "\n",
        "      friends:\n",
        "        - object : blog_posts\n",
        "          fields :\n",
        "            blog_idea_id : ${{blog_ideas.id}}\n",
        "            title : ${{seeds.title}} - ${{blog_ideas.section}}\n",
        "            body :\n",
        "              - Gemini.generate:\n",
        "                prompt_name : blog_generator.jinja\n",
        "                idea_title : ${{title}}\n",
        "                idea_body : ${{blog_ideas.body}}\n",
        "            author : Gemini\n",
        "\n",
        "          friends:\n",
        "            - object : blog_post_comments\n",
        "              fields :\n",
        "                blog_post_id : ${{blog_posts.id}}\n",
        "                author_id :\n",
        "                  random_reference : users\n",
        "                author_email : ${{author_id.email}}\n",
        "                comment :\n",
        "                  - Gemini.generate:\n",
        "                    prompt_name : comment_generator.jinja\n",
        "                    first_name : ${{author_id.first_name}}\n",
        "                    last_name : ${{author_id.last_name}}\n",
        "                    age : ${{author_id.age}}\n",
        "                    interests : ${{author_id.interests}}\n",
        "                    organization : ${{author_id.organization}}\n",
        "                    profession : ${{author_id.profession}}\n",
        "                    blog_title : ${{blog_posts.title}}\n",
        "                    blog_body : ${{blog_posts.body | truncate(1000)}}\n",
        "\"\"\""
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "nwKHSwZfPgXn"
      },
      "source": [
        "## Generating Data"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 15,
      "metadata": {
        "id": "WhSganUbPive"
      },
      "outputs": [],
      "source": [
        "generate_data(\n",
        "    StringIO(recipe),\n",
        "    output_format=\"csv\",\n",
        "    output_folder=\"outputs\",\n",
        "    user_options={\"wiki_title\": \"Python_(programming_language)\"},\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9N4XC09Uc-oH"
      },
      "source": [
        "### Results\n",
        "\n",
        "The synthetic data has been generated and stored as CSV files in the `outputs` folder."
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "synthetic_data_generation_using_gemini.ipynb",
      "toc_visible": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
